### Context ###

# Attach the packages used below. The workspace is deliberately NOT wiped with
# rm(list = ls()): that call deletes the caller's objects when this script is
# sourced, yet leaves attached packages and options untouched, so it does not
# provide a clean session. Run the script in a fresh R session instead.
library(tidyverse)
library(conText)
library(quanteda)
# --------------------------------
# Relative paths below ("uy.Rda", "Data/...") are resolved against the
# replication folder when it exists on this machine; otherwise the current
# working directory is kept so the script can run from a copy of the folder.
replication_dir <- "C:/Users/avane/Dropbox/UNC/Research/Dissertation/Chapter3/Replication"
if (dir.exists(replication_dir)) {
  setwd(replication_dir)
} else {
  message("Replication folder not found; using working directory: ", getwd())
}
# --------------------------------
# load corpus
# --------------------------------

load("uy.Rda")
load("ch.Rda")
load("arg.Rda")

# Uruguay: add the year of `date`, lower-case the text, keep the first row for
# each distinct body, and drop incomplete rows.
# year() is namespace-qualified because lubridate is only attached by
# library(tidyverse) from tidyverse 2.0.0 onwards.
# NOTE(review): drop_na() with no columns named drops rows with an NA in ANY
# column; unlike the Argentina block, the columns are not narrowed to
# body/left/year first -- confirm that this is intended.
uy <- uy |>
  mutate(
    year = lubridate::year(date),
    body = tolower(body)
  ) |>
  distinct(body, .keep_all = TRUE) |>
  tidyr::drop_na()

# quanteda corpus: one document per row, with `left` exposed as the docvar
# `party` alongside `year`.
uy_corpus <- corpus(
  uy$body,
  docvars = data.frame(party = uy$left, year = uy$year)
)

# Argentina: add the year of `date`, lower-case the text, keep only
# body/left/year, keep the first row for each distinct body, and drop rows
# with an NA in any of those three columns.
# year() is namespace-qualified because lubridate is only attached by
# library(tidyverse) from tidyverse 2.0.0 onwards.
arg <- arg |>
  mutate(
    year = lubridate::year(date),
    body = tolower(body)
  ) |>
  select(body, left, year) |>
  distinct(body, .keep_all = TRUE) |>
  tidyr::drop_na()

# quanteda corpus: one document per row, with `left` exposed as the docvar
# `party` alongside `year`.
arg_corpus <- corpus(
  arg$body,
  docvars = data.frame(party = arg$left, year = arg$year)
)


# Chile: add the year of `date2`, lower-case the text, keep the first row for
# each distinct body, and drop incomplete rows.
# year() is namespace-qualified because lubridate is only attached by
# library(tidyverse) from tidyverse 2.0.0 onwards.
# NOTE(review): drop_na() with no columns named drops rows with an NA in ANY
# column; unlike the Argentina block, the columns are not narrowed to
# body/left/year first -- confirm that this is intended.
ch <- ch |>
  mutate(
    year = lubridate::year(date2),
    body = tolower(body)
  ) |>
  distinct(body, .keep_all = TRUE) |>
  tidyr::drop_na()

# quanteda corpus: one document per row, with `left` exposed as the docvar
# `party` alongside `year`.
ch_corpus <- corpus(
  ch$body,
  docvars = data.frame(party = ch$left, year = ch$year)
)

# pre-trained embeddings & transformation matrix
# pretrained embeddings (NOTE: the file loaded below is named glove_vectors_eswiki.txt, not fastText)
# --------------------------------
# Column predicate: TRUE when a column holds at least one non-missing value.
not_all_na <- function(x) any(!is.na(x))

# Read the space-delimited embedding file, skipping its first line and treating
# every remaining line as data (no header, no quoting).
# setDT() is namespace-qualified because data.table is never attached in this
# script; the unqualified call fails with "could not find function".
# NOTE(review): the object is called `fasttext` but the file name says GloVe;
# the name is kept as-is.
fasttext <- data.table::setDT(
  read_delim(
    "Data/glove_vectors_eswiki.txt",
    delim = " ",
    quote = "",
    skip = 1,
    col_names = FALSE,
    col_types = cols()
  )
) %>%
  dplyr::select(where(not_all_na)) # remove last column which is all NA

# rownames = 1 (data.table's as.matrix method) turns the first column into the
# row names of the matrix; the column names are then dropped.
pre_trained <- as.matrix(fasttext, rownames = 1)
colnames(pre_trained) <- NULL
rm(fasttext)

# transformation matrix
# NOTE: the name `transform` shadows base::transform() as a variable; it is kept
# because the get_cos_sim() calls below pass it as `transform_matrix`.
transform <- readRDS("Data/glove_transform_eswiki_25.rds")

# --------------------------------
# pre-processing
# --------------------------------
# Tokenise the Uruguay corpus without punctuation, symbols, numbers, URLs and
# separators, then remove tokens matching the *minist* / *secretar* patterns,
# the listed fixed terms, and Spanish stopwords.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
uy_toks <- tokens(
  uy_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
) %>%
  tokens_remove(c(
    "*minist*", "*secretar*", "fonavi", "anep",
    "mides", "mevir", "mtop", stopwords("spanish")
  ))


# Tokenise the Argentina corpus without punctuation, symbols, numbers, URLs and
# separators, then remove tokens matching the *minist* / *secretar* patterns,
# the listed fixed terms, and Spanish stopwords.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
arg_toks <- tokens(
  arg_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
) %>%
  tokens_remove(c(
    "*minist*", "*secretar*", "cristina", "macri", stopwords("spanish")
  ))


# Tokenise the Chile corpus without punctuation, symbols, numbers, URLs and
# separators, then remove tokens matching the *minist* / *secretar* patterns,
# the listed fixed terms, and Spanish stopwords.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# NOTE(review): this removal list is identical to the one used for Uruguay --
# confirm these fixed terms are the ones intended for the Chilean corpus.
ch_toks <- tokens(
  ch_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
) %>%
  tokens_remove(c(
    "*minist*", "*secretar*", "fonavi", "anep",
    "mides", "mevir", "mtop", stopwords("spanish")
  ))
                           

# --------------------------------
# cosine similarity between policy-term contexts and a fixed feature set,
# by party, for each country
# --------------------------------

# get_cos_sim() bootstraps its standard errors and confidence intervals, so the
# RNG is seeded to make the output identical across runs. The seed value itself
# is arbitrary.
set.seed(1234L)

# Features each policy context is compared against (same set for every call).
sip_features <- c(
  "incluir", "igualdad", "pobres",
  "producción", "crecimiento", "innovación", "desarrollo"
)

# Cosine similarity between the contexts in `context_toks`, grouped by the
# `party` docvar, and `sip_features`.
#
# context_toks: output of tokens_context() for one set of target patterns.
# policy:       label stored in the `policy` column of the result.
#
# Reads the globals `pre_trained`, `transform` and `sip_features`. Returns the
# get_cos_sim() result (as_list = FALSE) with a `policy` column added.
sip_cos_sim <- function(context_toks, policy) {
  sims <- get_cos_sim(
    x = context_toks,
    groups = docvars(context_toks, "party"),
    features = sip_features,
    pre_trained = pre_trained,
    transform = TRUE,
    transform_matrix = transform,
    bootstrap = TRUE,
    num_bootstraps = 100,
    as_list = FALSE,
    confidence_level = 0.95
  )
  sims$policy <- policy
  sims
}

# NOTE(review): the target patterns differ across countries (education uses
# "alumn*" + "estudiant*" for Argentina, only "alumn*" for Chile and only
# "estudiant*" for Uruguay; housing is the exact token "vivienda" for Argentina
# but the pattern "vivienda*" elsewhere). They are reproduced unchanged here --
# confirm that the differences are intentional.

# Argentina
sip_toks <- tokens_context(
  x = arg_toks, pattern = c("alumn*", "estudiant*"), window = 6L
)
sip_ed <- sip_cos_sim(sip_toks, "education")

sip_toks <- tokens_context(
  x = arg_toks, pattern = c("trabajad*", "emple*"), window = 6L
)
sip_t <- sip_cos_sim(sip_toks, "labor")

sip_toks <- tokens_context(x = arg_toks, pattern = c("vivienda"), window = 6L)
sip_v <- sip_cos_sim(sip_toks, "housing")

sip_arg <- rbind(sip_ed, sip_t, sip_v)
sip_arg$country <- "Argentina"


# Chile
sip_toks <- tokens_context(x = ch_toks, pattern = c("alumn*"), window = 6L)
sip_ed <- sip_cos_sim(sip_toks, "education")

sip_toks <- tokens_context(
  x = ch_toks, pattern = c("trabajad*", "emple*"), window = 6L
)
sip_t <- sip_cos_sim(sip_toks, "labor")

sip_toks <- tokens_context(x = ch_toks, pattern = c("vivienda*"), window = 6L)
sip_v <- sip_cos_sim(sip_toks, "housing")

sip_ch <- rbind(sip_ed, sip_t, sip_v)
sip_ch$country <- "Chile"


# Uruguay
sip_toks <- tokens_context(x = uy_toks, pattern = c("estudiant*"), window = 6L)
sip_ed <- sip_cos_sim(sip_toks, "education")

sip_toks <- tokens_context(
  x = uy_toks, pattern = c("trabajad*", "emple*"), window = 6L
)
sip_t <- sip_cos_sim(sip_toks, "labor")

sip_toks <- tokens_context(x = uy_toks, pattern = c("vivienda*"), window = 6L)
sip_v <- sip_cos_sim(sip_toks, "housing")

sip_uy <- rbind(sip_ed, sip_t, sip_v)
sip_uy$country <- "Uruguay"



